'''
重命名之后的数据，需要进行数据清洗
这里我们只记录文书名，案件内容，裁定结果是批准还是驳回
将这些清洗好的数据放到mongodb中
'''

from docx import Document
from pymongo import MongoClient
import jieba
import os


def _build_law_record(num_id, paragraphs):
    """Build the record stored for one document, or ``None`` if unlabelled.

    Args:
        num_id: Value stored under ``"num_id"`` (the caller passes the file
            name).
        paragraphs: Paragraph texts of the document, in document order.

    Returns:
        A dict with the keys ``num_id``, ``title``, ``content``,
        ``label_content`` and ``label``, or ``None`` when no ruling label
        can be derived from the paragraphs.
    """
    # Only the first paragraph containing the marker is considered; the
    # paragraph right after it is treated as the ruling. Pairing every
    # paragraph with its successor guarantees that a marker found in the very
    # last paragraph (or a document with fewer than two paragraphs) never
    # indexes past the end of the list.
    for text, ruling in zip(paragraphs, paragraphs[1:]):
        if "裁定如下" not in text:
            continue
        if "准许" in ruling:
            label = 1
        elif "驳回" in ruling:
            label = 0
        else:
            # Marker found but the ruling is neither "granted" nor
            # "rejected": the document is left out.
            return None
        return {
            "num_id": num_id,
            # The title is read from the second paragraph of the document.
            "title": paragraphs[1],
            "content": text,
            "label_content": ruling,
            "label": label,
        }
    return None


def data_to_mongodb(db):
    """Read every document under ``./data`` and store labelled ones.

    Each sub-directory of ``./data`` is scanned; every file in it is opened
    with ``Document`` and reduced to a record (file name, title, the
    paragraph containing "裁定如下", the following ruling paragraph and a
    label: 1 when the ruling contains "准许", 0 when it contains "驳回").
    Records without a label are not stored.

    Args:
        db: Database object; records are written with
            ``db.law_data.insert_one``.
    """
    for dir_name in os.listdir("./data"):
        book_path = "./data/" + dir_name
        # Stray plain files directly inside ./data cannot be listed as
        # directories; skip them instead of crashing.
        if not os.path.isdir(book_path):
            continue
        for law_book in os.listdir(book_path):
            path = book_path + "/" + law_book
            print(path)
            # Extract the paragraph texts while the file is open, then let
            # the context manager close the handle.
            with open(path, 'rb') as f:
                paragraphs = [p.text for p in Document(f).paragraphs]

            record = _build_law_record(law_book, paragraphs)
            if record is not None:
                db.law_data.insert_one(record)

def clear(db):
    """Strip citation sentences from stored content and save the result.

    For every document in ``db.law_data`` the content is split on "。",
    sentences containing "《" are dropped, and the remaining sentences are
    re-joined, each terminated by "。". Documents whose content ends up
    empty are skipped; the others are written to ``db.clear`` with only the
    ``num_id``, ``content`` and ``label`` fields.

    Args:
        db: Database object; reads ``db.law_data.find()`` and writes with
            ``db.clear.insert_one``.
    """
    for content in db.law_data.find():
        sentences = content['content'].split('。')
        # split() yields an empty trailing piece when the text ends with
        # "。"; ignoring empty pieces avoids appending a spurious extra "。"
        # and lets the emptiness check below actually detect content made up
        # solely of citation sentences.
        kept = [
            sentence + '。'
            for sentence in sentences
            if sentence and '《' not in sentence
        ]
        text_new = ''.join(kept)
        if not text_new:
            continue
        data = {
            'num_id': content['num_id'],
            'content': text_new,
            'label': content['label'],
        }
        db.clear.insert_one(data)
        print(data['num_id'])

def train_test_class(db, test_1_num=5, test_0_num=56):
    """Split the cleaned documents into a test set and a training set.

    Documents are taken from ``db.clear.find()`` in iteration order. The
    first ``test_1_num`` documents with label 1 and the first ``test_0_num``
    documents with label 0 go to ``db.test``; every other document goes to
    ``db.train``. Documents are inserted exactly as returned by ``find()``.

    Args:
        db: Database object providing the ``clear``, ``test`` and ``train``
            collections.
        test_1_num: Number of label-1 documents reserved for the test set.
        test_0_num: Number of label-0 documents reserved for the test set.
    """
    for item in db.clear.find():
        if item["label"] == 1 and test_1_num > 0:
            db.test.insert_one(item)
            test_1_num -= 1
        elif item["label"] == 0 and test_0_num > 0:
            db.test.insert_one(item)
            test_0_num -= 1
        else:
            # Quota for this label is used up (or the label is neither 0
            # nor 1): the document belongs to the training set.
            db.train.insert_one(item)

def _store_segmented(source, target, punctuations):
    """Segment every document of ``source`` with jieba and save it in ``target``.

    Args:
        source: Collection read with ``find()``; each document needs the
            ``num_id``, ``content`` and ``label`` keys.
        target: Collection written with ``insert_one``.
        punctuations: Tokens to leave out of the segmented text.
    """
    for item in source.find():
        tokens = jieba.lcut(item["content"])
        # Every kept token is prefixed with one space, so the stored string
        # starts with a space and tokens are space-separated.
        segmented = "".join(
            " " + token for token in tokens if token not in punctuations
        )
        target.insert_one({
            "num_id": item["num_id"],
            "content": segmented,
            "label": item["label"],
        })


def data_jieba(db):
    """Segment the train and test sets with jieba and store them separately.

    Documents from ``db.train`` are written to ``db.train_jieba`` and
    documents from ``db.test`` to ``db.test_jieba``. The stored content is
    the jieba tokens joined by spaces with the listed punctuation tokens
    removed.

    Args:
        db: Database object providing the ``train``, ``test``,
            ``train_jieba`` and ``test_jieba`` collections.
    """
    punctuations = ['，', '。', '、', '；', '“', '”', '——', '—-']
    # The training set is processed first, then the test set.
    _store_segmented(db.train, db.train_jieba, punctuations)
    _store_segmented(db.test, db.test_jieba, punctuations)


if __name__ == "__main__":
    # Open the connection with default settings; the context manager closes
    # the client once the whole pipeline has run (or failed).
    with MongoClient() as client:
        # Select the database; MongoDB creates it on first insert if missing.
        db = client["LawData"]

        # Step 1: load the raw documents into the database.
        data_to_mongodb(db)

        # Step 2: clean the stored content a second time.
        clear(db)

        # Step 3: split the cleaned data into training and test sets.
        train_test_class(db)

        # Step 4: segment both sets with jieba and store the result.
        data_jieba(db)